{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from lxml.html import fromstring\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test: confirm Selenium can drive Chrome by running one Baidu search.\n",
    "from selenium import webdriver\n",
    "import time\n",
    "\n",
    "wd = webdriver.Chrome()\n",
    "try:\n",
    "    wd.get(\"https://www.baidu.com\")  # open the Baidu home page\n",
    "    wd.find_element(By.ID, \"kw\").send_keys(\"selenium\")  # type the keyword into the search box\n",
    "    wd.find_element(By.ID, \"su\").click()  # click the search button\n",
    "    time.sleep(3)  # keep the results page open for 3 seconds\n",
    "finally:\n",
    "    wd.quit()  # always close the browser, even if a lookup above raises"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "# Chrome launch options for the CNKI browsing session.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # avoids the DevToolsActivePort-file-missing startup error\n",
    "opts.add_argument('--window-size=1920,3000')  # Chrome expects WIDTH,HEIGHT separated by a comma\n",
    "opts.add_argument('--disable-gpu')  # Google's docs mention this flag as a bug workaround\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, needed for some special pages\n",
    "# opts.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up loading\n",
    "# opts.add_argument('--headless')  # no visible window; Linux hosts without a display fail to start without it\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CNKI home page in the driver created above.\n",
    "driver.get(\"https://www.cnki.net/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击高级检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the advanced-search entry (element id \"highSearch\").\n",
    "driver.find_element(By.ID, 'highSearch').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 检查窗口位置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-770606C07179C4588D274553E3544661'"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handle of the window the driver is currently attached to.\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-770606C07179C4588D274553E3544661',\n",
       " 'CDwindow-581E4D7F5A229CAD508A77D2C8596B32']"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every open window handle; the recorded run shows two at this point.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Move the driver to the second window handle.\n",
    "# `switch_to.window` replaces the deprecated `switch_to_window` helper.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击学术期刊"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the document-type menu entry whose data-id is \"xsqk\".\n",
    "driver.find_element(By.XPATH, '//ul[@class=\"doctype-menus keji\"]/li[@data-id=\"xsqk\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击专业检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the element named \"majorSearch\".\n",
    "driver.find_element(By.NAME, 'majorSearch').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 勾选期刊类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SCI: click the source-category checkbox with key=\"SI\".\n",
    "driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"SI\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EI: click the source-category checkbox with key=\"EI\".\n",
    "driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"EI\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PKU Core Journals: click the source-category checkbox with key=\"HX\".\n",
    "driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"HX\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSSCI: click the source-category checkbox with key=\"CSI\".\n",
    "driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"CSI\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSCD: click the source-category checkbox with key=\"CSD\".\n",
    "driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"CSD\"]').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 填写query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search expression typed into the query box in the next cell; both SU terms must match.\n",
    "# NOTE(review): SU is presumably CNKI's subject field code - confirm against CNKI's query syntax.\n",
    "NW_互联网_query = \"SU=互联网 and SU=新媒体\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the first <textarea>, clear any existing text, then type the query expression.\n",
    "element = driver.find_element(By.XPATH, '//textarea')\n",
    "element.clear()\n",
    "element.send_keys(NW_互联网_query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the search button, i.e. the <input> whose value attribute is \"检索\".\n",
    "driver.find_element(By.XPATH, '//input[@value=\"检索\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the results-per-page control (element id \"perPageDiv\").\n",
    "driver.find_element(By.ID, 'perPageDiv').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the 50-results-per-page entry.\n",
    "driver.find_element(By.XPATH, '//li[@data-val=\"50\"]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成</td>\n",
       "      <td>杨宗晓; 杨克</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2021-06-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>82.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》</td>\n",
       "      <td>高倩楠</td>\n",
       "      <td>中国测试</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>论我国网上理论阵地的传播态势与发展大势</td>\n",
       "      <td>王凤翔</td>\n",
       "      <td>湖南大学学报(社会科学版)</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例</td>\n",
       "      <td>李红妮</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>新媒体技术与体育运动传播的融合路径——评《新媒体与体育传播》</td>\n",
       "      <td>鲁俊华</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>高校学生沟通机制探究——以北京某高校为例</td>\n",
       "      <td>何家唯; 张权</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-05-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>后疫情时代改善与重塑国家形象的新媒体传播策略</td>\n",
       "      <td>匡文波; 马茜茜</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>475.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>互联网背景下大学公共英语教学效率提升研究——评《网络与新媒体专业英语教程》</td>\n",
       "      <td>赵鹏</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>以互联网为平台开展艾滋病防治的优势和挑战</td>\n",
       "      <td>赵好; 刘惠; 韩孟杰</td>\n",
       "      <td>中国艾滋病性病</td>\n",
       "      <td>2021-04-26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>我国互联网糖尿病健康信息资源的合理应用及挑战</td>\n",
       "      <td>汪洋; 王觅也; 周有莲; 何龙韬; 李舍予</td>\n",
       "      <td>中国全科医学</td>\n",
       "      <td>2021-04-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>137.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>“构建网上网下一体、内宣外宣联动的主流舆论格局”——基于习近平相关论述的分析</td>\n",
       "      <td>步新娜; 魏继昆</td>\n",
       "      <td>党的文献</td>\n",
       "      <td>2021-04-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>我国互联网电视管理政策法规解析</td>\n",
       "      <td>赵珊珊</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-04-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>融媒体时代电视新闻节目的发展</td>\n",
       "      <td>张慧</td>\n",
       "      <td>当代电视</td>\n",
       "      <td>2021-04-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>70.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>以讲好中国故事为着力点，创新推进国际传播</td>\n",
       "      <td>朱新梅</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2021-04-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>295.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>“互联网+”背景下企业新媒体营销策略研究——评《新媒体营销与策划》</td>\n",
       "      <td>张会娟</td>\n",
       "      <td>商业经济研究</td>\n",
       "      <td>2021-03-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>715.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>运动服装品牌跨界营销策略优化研究</td>\n",
       "      <td>孙湉; 沈雷</td>\n",
       "      <td>毛纺科技</td>\n",
       "      <td>2021-03-16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>332.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>“互联网+”背景下传统学术期刊的发展策略——以基础教育类教师期刊为例</td>\n",
       "      <td>杨强</td>\n",
       "      <td>编辑学刊</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>媒体脸谱</td>\n",
       "      <td>NaN</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-03-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>广电新媒体融合的“变与不变”——以江苏广电荔枝新闻为例</td>\n",
       "      <td>王智勇</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2021-03-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>地方都市报的转型路径和突围策略——以山西《生活晨报》为例</td>\n",
       "      <td>张凡桢; 佟霏</td>\n",
       "      <td>出版广角</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>媒介使用对地方政府信任的作用机制研究</td>\n",
       "      <td>帅满; 罗家德; 郭孟伦</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2021-02-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>727.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>互联网时代青年思政教育模式创新探索——评《新媒体视角下大学生思政教育创新探索》</td>\n",
       "      <td>王晓先</td>\n",
       "      <td>中国高校科技</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>91.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>新媒体艺术传播对乡村景观艺术设计的影响——推荐《新媒体艺术导论》</td>\n",
       "      <td>李一霏; 杨荣华; 万妍彦</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>66.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>态势、议题与路径——互联网时代的传媒经济研究</td>\n",
       "      <td>谭天; 杨冬旭</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>141.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>网络时代民族团结教育研究——铸牢中华民族共同体意识研究系列论文之一</td>\n",
       "      <td>王卓</td>\n",
       "      <td>广西民族研究</td>\n",
       "      <td>2021-02-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>168.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>媒介变迁视野下旅游体验分享建构目的地形象研究</td>\n",
       "      <td>汪东亮</td>\n",
       "      <td>社会科学家</td>\n",
       "      <td>2021-02-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>277.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>浅析广播电视新闻评论在网络媒体中的新常态运用</td>\n",
       "      <td>李节; 钟强</td>\n",
       "      <td>当代电视</td>\n",
       "      <td>2021-02-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>108.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>地方媒体:区域深耕与全国化布局</td>\n",
       "      <td>NaN</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-01-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>接受新旧媒体死死生生的常态</td>\n",
       "      <td>詹新惠</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-01-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>22.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>运用现代信息技术 构建思政教学体系——评《基于互联网云平台的高职思想政治理论课教学方法改革研究》</td>\n",
       "      <td>张铭</td>\n",
       "      <td>山西财经大学学报</td>\n",
       "      <td>2021-01-26 10:21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>287.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>“互联网+”下新媒体营销对农村经济发展的促进作用</td>\n",
       "      <td>任淑华; 贾培瑶; 赵忆岚</td>\n",
       "      <td>核农学报</td>\n",
       "      <td>2021-01-25</td>\n",
       "      <td>1.0</td>\n",
       "      <td>536.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>新媒体语境下马克思主义大众传播研究——推荐《基于新媒体技术的马克思主义传播》</td>\n",
       "      <td>瞿静</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2021-01-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>168.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>新媒体环境下实体书店发展路径探析——以十点书店为例</td>\n",
       "      <td>张之晔; 李常庆</td>\n",
       "      <td>出版广角</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>300.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>智能驱动:认清新业态,促进真融合</td>\n",
       "      <td>初令伟; 谭天</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>191.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>从传媒巨变看新闻教育</td>\n",
       "      <td>汪伟</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>113.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>网络的法律地位:行政确认与《民法典》法律界定</td>\n",
       "      <td>陆小华</td>\n",
       "      <td>山西大学学报(哲学社会科学版)</td>\n",
       "      <td>2021-01-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>154.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>“国潮”与“真我”:互联网时代青年群体的自我呈现</td>\n",
       "      <td>邢海燕</td>\n",
       "      <td>西南民族大学学报(人文社会科学版)</td>\n",
       "      <td>2021-01-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1341.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>新媒体时代“四力”的突围与跨越——基于“十三五”时期中国新媒体发展的几个焦点</td>\n",
       "      <td>李明德; 赵琛</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2021-01-05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>589.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>构建网络内容治理主体协同机制的作用与优化路径</td>\n",
       "      <td>谢新洲; 宋琢</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-01-05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>374.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>网络化时代营销传播渠道的冲突及优化之策</td>\n",
       "      <td>郑带利</td>\n",
       "      <td>商业经济研究</td>\n",
       "      <td>2020-12-21</td>\n",
       "      <td>1.0</td>\n",
       "      <td>363.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>浅析“互联网+”背景下农产品新媒体营销研究</td>\n",
       "      <td>李明</td>\n",
       "      <td>食品工业</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>1.0</td>\n",
       "      <td>371.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>新媒体时代传统主流媒体的“破圈”与启示——以“四川观察”为例</td>\n",
       "      <td>卢雪尧</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>687.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>互联网背景下网络和新媒体技术对文学发展的影响研究</td>\n",
       "      <td>潘裕仙</td>\n",
       "      <td>食品研究与开发</td>\n",
       "      <td>2020-12-20</td>\n",
       "      <td>1.0</td>\n",
       "      <td>159.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>新媒体产业资本流通与价值转移的影响机制研究——以网络视听行业为例</td>\n",
       "      <td>王建磊</td>\n",
       "      <td>新闻大学</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>438.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>新媒体时代数字化阅读推广创新模式——评《数字化阅读设计》</td>\n",
       "      <td>唐远锋</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>70.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>互联网驱动下中华饮食文化变迁影响机制</td>\n",
       "      <td>唐东平; 苏彩虹</td>\n",
       "      <td>美食研究</td>\n",
       "      <td>2020-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>228.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>国有媒体服务功能拓展的类型和关键</td>\n",
       "      <td>郭全中</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2020-12-10</td>\n",
       "      <td>2.0</td>\n",
       "      <td>55.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>新型广播的探索与实践</td>\n",
       "      <td>阎晓明</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2020-12-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>55.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>短视频趋势下政务新媒体困境与进路</td>\n",
       "      <td>何海翔</td>\n",
       "      <td>中国出版</td>\n",
       "      <td>2020-12-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>910.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>信息渠道如何影响商业养老保险决策——来自CGSS的微观证据</td>\n",
       "      <td>彭魏倬加</td>\n",
       "      <td>中南大学学报(社会科学版)</td>\n",
       "      <td>2020-11-26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>375.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                篇名  \\\n",
       "0            1                      互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成   \n",
       "1            2           新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》   \n",
       "2            3                               论我国网上理论阵地的传播态势与发展大势   \n",
       "3            4                   新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例   \n",
       "4            5                    新媒体技术与体育运动传播的融合路径——评《新媒体与体育传播》   \n",
       "5            6                              高校学生沟通机制探究——以北京某高校为例   \n",
       "6            7                            后疫情时代改善与重塑国家形象的新媒体传播策略   \n",
       "7            8             互联网背景下大学公共英语教学效率提升研究——评《网络与新媒体专业英语教程》   \n",
       "8            9                              以互联网为平台开展艾滋病防治的优势和挑战   \n",
       "9           10                            我国互联网糖尿病健康信息资源的合理应用及挑战   \n",
       "10          11            “构建网上网下一体、内宣外宣联动的主流舆论格局”——基于习近平相关论述的分析   \n",
       "11          12                                   我国互联网电视管理政策法规解析   \n",
       "12          13                                    融媒体时代电视新闻节目的发展   \n",
       "13          14                              以讲好中国故事为着力点，创新推进国际传播   \n",
       "14          15                 “互联网+”背景下企业新媒体营销策略研究——评《新媒体营销与策划》   \n",
       "15          16                                  运动服装品牌跨界营销策略优化研究   \n",
       "16          17                “互联网+”背景下传统学术期刊的发展策略——以基础教育类教师期刊为例   \n",
       "17          18                                              媒体脸谱   \n",
       "18          19                       广电新媒体融合的“变与不变”——以江苏广电荔枝新闻为例   \n",
       "19          20                      地方都市报的转型路径和突围策略——以山西《生活晨报》为例   \n",
       "20          21                                媒介使用对地方政府信任的作用机制研究   \n",
       "21          22           互联网时代青年思政教育模式创新探索——评《新媒体视角下大学生思政教育创新探索》   \n",
       "22          23                  新媒体艺术传播对乡村景观艺术设计的影响——推荐《新媒体艺术导论》   \n",
       "23          24                            态势、议题与路径——互联网时代的传媒经济研究   \n",
       "24          25                 网络时代民族团结教育研究——铸牢中华民族共同体意识研究系列论文之一   \n",
       "25          26                            媒介变迁视野下旅游体验分享建构目的地形象研究   \n",
       "26          27                            浅析广播电视新闻评论在网络媒体中的新常态运用   \n",
       "27          28                                   地方媒体:区域深耕与全国化布局   \n",
       "28          29                                     接受新旧媒体死死生生的常态   \n",
       "29          30  运用现代信息技术 构建思政教学体系——评《基于互联网云平台的高职思想政治理论课教学方法改革研究》   \n",
       "30          31                          “互联网+”下新媒体营销对农村经济发展的促进作用   \n",
       "31          32            新媒体语境下马克思主义大众传播研究——推荐《基于新媒体技术的马克思主义传播》   \n",
       "32          33                         新媒体环境下实体书店发展路径探析——以十点书店为例   \n",
       "33          34                                  智能驱动:认清新业态,促进真融合   \n",
       "34          35                                        从传媒巨变看新闻教育   \n",
       "35          36                            网络的法律地位:行政确认与《民法典》法律界定   \n",
       "36          37                          “国潮”与“真我”:互联网时代青年群体的自我呈现   \n",
       "37          38            新媒体时代“四力”的突围与跨越——基于“十三五”时期中国新媒体发展的几个焦点   \n",
       "38          39                            构建网络内容治理主体协同机制的作用与优化路径   \n",
       "39          40                               网络化时代营销传播渠道的冲突及优化之策   \n",
       "40          41                             浅析“互联网+”背景下农产品新媒体营销研究   \n",
       "41          42                    新媒体时代传统主流媒体的“破圈”与启示——以“四川观察”为例   \n",
       "42          43                          互联网背景下网络和新媒体技术对文学发展的影响研究   \n",
       "43          44                  新媒体产业资本流通与价值转移的影响机制研究——以网络视听行业为例   \n",
       "44          45                      新媒体时代数字化阅读推广创新模式——评《数字化阅读设计》   \n",
       "45          46                                互联网驱动下中华饮食文化变迁影响机制   \n",
       "46          47                                  国有媒体服务功能拓展的类型和关键   \n",
       "47          48                                        新型广播的探索与实践   \n",
       "48          49                                  短视频趋势下政务新媒体困境与进路   \n",
       "49          50                     信息渠道如何影响商业养老保险决策——来自CGSS的微观证据   \n",
       "\n",
       "                        作者                 刊名              发表时间   被引      下载  \\\n",
       "0                  杨宗晓; 杨克               农业经济        2021-06-04  NaN    82.0   \n",
       "1                      高倩楠               中国测试        2021-05-31  NaN    16.0   \n",
       "2                      王凤翔      湖南大学学报(社会科学版)        2021-05-28  NaN    31.0   \n",
       "3                      李红妮                 传媒        2021-05-25  NaN    60.0   \n",
       "4                      鲁俊华             中国科技论文        2021-05-15  NaN     NaN   \n",
       "5                  何家唯; 张权          学校党建与思想教育        2021-05-08  NaN    69.0   \n",
       "6                 匡文波; 马茜茜              新闻与写作        2021-05-05  NaN   475.0   \n",
       "7                       赵鹏           中国广播电视学刊        2021-05-01  NaN    21.0   \n",
       "8              赵好; 刘惠; 韩孟杰            中国艾滋病性病        2021-04-26  NaN    90.0   \n",
       "9   汪洋; 王觅也; 周有莲; 何龙韬; 李舍予             中国全科医学        2021-04-20  NaN   137.0   \n",
       "10                步新娜; 魏继昆               党的文献        2021-04-15  NaN    90.0   \n",
       "11                     赵珊珊                 传媒        2021-04-10  NaN    30.0   \n",
       "12                      张慧               当代电视        2021-04-01  NaN    70.0   \n",
       "13                     朱新梅           中国广播电视学刊        2021-04-01  NaN   295.0   \n",
       "14                     张会娟             商业经济研究        2021-03-22  NaN   715.0   \n",
       "15                  孙湉; 沈雷               毛纺科技        2021-03-16  NaN   332.0   \n",
       "16                      杨强               编辑学刊        2021-03-15  NaN    33.0   \n",
       "17                     NaN               青年记者        2021-03-15  NaN     7.0   \n",
       "18                     王智勇           中国广播电视学刊        2021-03-01  NaN    38.0   \n",
       "19                 张凡桢; 佟霏               出版广角        2021-02-28  NaN    13.0   \n",
       "20            帅满; 罗家德; 郭孟伦              国际新闻界        2021-02-23  NaN   727.0   \n",
       "21                     王晓先             中国高校科技        2021-02-20  NaN    91.0   \n",
       "22           李一霏; 杨荣华; 万妍彦               新闻记者        2021-02-20  NaN    66.0   \n",
       "23                 谭天; 杨冬旭              新闻爱好者        2021-02-20  NaN   141.0   \n",
       "24                      王卓             广西民族研究        2021-02-20  NaN   168.0   \n",
       "25                     汪东亮              社会科学家        2021-02-05  NaN   277.0   \n",
       "26                  李节; 钟强               当代电视        2021-02-01  NaN   108.0   \n",
       "27                     NaN               青年记者        2021-01-30  NaN    26.0   \n",
       "28                     詹新惠               青年记者        2021-01-30  NaN    22.0   \n",
       "29                      张铭           山西财经大学学报  2021-01-26 10:21  NaN   287.0   \n",
       "30           任淑华; 贾培瑶; 赵忆岚               核农学报        2021-01-25  1.0   536.0   \n",
       "31                      瞿静               新闻记者        2021-01-20  NaN   168.0   \n",
       "32                张之晔; 李常庆               出版广角        2021-01-15  NaN   300.0   \n",
       "33                 初令伟; 谭天               青年记者        2021-01-15  NaN   191.0   \n",
       "34                      汪伟               青年记者        2021-01-15  NaN   113.0   \n",
       "35                     陆小华    山西大学学报(哲学社会科学版)        2021-01-15  2.0   154.0   \n",
       "36                     邢海燕  西南民族大学学报(人文社会科学版)        2021-01-10  NaN  1341.0   \n",
       "37                 李明德; 赵琛               编辑之友        2021-01-05  1.0   589.0   \n",
       "38                 谢新洲; 宋琢              新闻与写作        2021-01-05  1.0   374.0   \n",
       "39                     郑带利             商业经济研究        2020-12-21  1.0   363.0   \n",
       "40                      李明               食品工业        2020-12-20  1.0   371.0   \n",
       "41                     卢雪尧               青年记者        2020-12-20  NaN   687.0   \n",
       "42                     潘裕仙            食品研究与开发        2020-12-20  1.0   159.0   \n",
       "43                     王建磊               新闻大学        2020-12-15  NaN   438.0   \n",
       "44                     唐远锋             中国科技论文        2020-12-15  NaN    70.0   \n",
       "45                唐东平; 苏彩虹               美食研究        2020-12-15  NaN   228.0   \n",
       "46                     郭全中               青年记者        2020-12-10  2.0    55.0   \n",
       "47                     阎晓明           中国广播电视学刊        2020-12-01  NaN    55.0   \n",
       "48                     何海翔               中国出版        2020-12-01  1.0   910.0   \n",
       "49                    彭魏倬加      中南大学学报(社会科学版)        2020-11-26  NaN   375.0   \n",
       "\n",
       "    操作  \n",
       "0   下载  \n",
       "1   下载  \n",
       "2   下载  \n",
       "3   下载  \n",
       "4   下载  \n",
       "5   下载  \n",
       "6   下载  \n",
       "7   下载  \n",
       "8   下载  \n",
       "9   下载  \n",
       "10  下载  \n",
       "11  下载  \n",
       "12  下载  \n",
       "13  下载  \n",
       "14  下载  \n",
       "15  下载  \n",
       "16  下载  \n",
       "17  下载  \n",
       "18  下载  \n",
       "19  下载  \n",
       "20  下载  \n",
       "21  下载  \n",
       "22  下载  \n",
       "23  下载  \n",
       "24  下载  \n",
       "25  下载  \n",
       "26  下载  \n",
       "27  下载  \n",
       "28  下载  \n",
       "29  下载  \n",
       "30  下载  \n",
       "31  下载  \n",
       "32  下载  \n",
       "33  下载  \n",
       "34  下载  \n",
       "35  下载  \n",
       "36  下载  \n",
       "37  下载  \n",
       "38  下载  \n",
       "39  下载  \n",
       "40  下载  \n",
       "41  下载  \n",
       "42  下载  \n",
       "43  下载  \n",
       "44  下载  \n",
       "45  下载  \n",
       "46  下载  \n",
       "47  下载  \n",
       "48  下载  \n",
       "49  下载  "
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_id('gridTable')\n",
    "page_html = element.get_attribute('innerHTML')\n",
    "pd.read_html(page_html)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 翻页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1/48'"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//span[@class=\"countPageMark\"]')\n",
    "page_str = element.get_attribute('innerHTML')\n",
    "page_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global varialbes \n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "pages = list(range(1,17))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(5+13*random())\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        html_raw[p] = main_content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "13  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "14  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "15  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "16  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "网站 = \"中国知网\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : {  \"htm_snippets\": \"data_raw_src/知网_htm_snippets_{网站}.tsv\"}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = fn [\"output\"] [\"htm_snippets\"] \n",
    "df.to_csv(filename.format(网站=网站), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_df = []\n",
    "for p in pages:\n",
    "    表格 = pd.read_html(html_raw[p])[0]\n",
    "    l_df.append(表格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成</td>\n",
       "      <td>杨宗晓; 杨克</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2021-06-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>82.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》</td>\n",
       "      <td>高倩楠</td>\n",
       "      <td>中国测试</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>论我国网上理论阵地的传播态势与发展大势</td>\n",
       "      <td>王凤翔</td>\n",
       "      <td>湖南大学学报(社会科学版)</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例</td>\n",
       "      <td>李红妮</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>新媒体技术与体育运动传播的融合路径——评《新媒体与体育传播》</td>\n",
       "      <td>鲁俊华</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>795</th>\n",
       "      <td>846</td>\n",
       "      <td>让报道受欢迎,新华社新媒体用了这些“妙招”</td>\n",
       "      <td>马书平</td>\n",
       "      <td>中国记者</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>796</th>\n",
       "      <td>847</td>\n",
       "      <td>法国互联网产业的发展与新趋向</td>\n",
       "      <td>张伟</td>\n",
       "      <td>中国记者</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>2.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>797</th>\n",
       "      <td>848</td>\n",
       "      <td>阿基米德:打造传统广播转型平台</td>\n",
       "      <td>王海滨</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>4.0</td>\n",
       "      <td>136.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>798</th>\n",
       "      <td>849</td>\n",
       "      <td>杭州交通经济广播融合发展探索</td>\n",
       "      <td>蒋新琴; 魏琦</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>64.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>799</th>\n",
       "      <td>850</td>\n",
       "      <td>互联网思维下思想政治教育价值实现途径</td>\n",
       "      <td>李晓容</td>\n",
       "      <td>中学政治教学参考</td>\n",
       "      <td>2017-03-30</td>\n",
       "      <td>2.0</td>\n",
       "      <td>131.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>850 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                       篇名       作者  \\\n",
       "0             1             互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成  杨宗晓; 杨克   \n",
       "1             2  新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》      高倩楠   \n",
       "2             3                      论我国网上理论阵地的传播态势与发展大势      王凤翔   \n",
       "3             4          新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例      李红妮   \n",
       "4             5           新媒体技术与体育运动传播的融合路径——评《新媒体与体育传播》      鲁俊华   \n",
       "..          ...                                      ...      ...   \n",
       "795         846                    让报道受欢迎,新华社新媒体用了这些“妙招”      马书平   \n",
       "796         847                           法国互联网产业的发展与新趋向       张伟   \n",
       "797         848                          阿基米德:打造传统广播转型平台      王海滨   \n",
       "798         849                           杭州交通经济广播融合发展探索  蒋新琴; 魏琦   \n",
       "799         850                       互联网思维下思想政治教育价值实现途径      李晓容   \n",
       "\n",
       "                刊名        发表时间   被引     下载  操作  \n",
       "0             农业经济  2021-06-04  NaN   82.0  下载  \n",
       "1             中国测试  2021-05-31  NaN   16.0  下载  \n",
       "2    湖南大学学报(社会科学版)  2021-05-28  NaN   31.0  下载  \n",
       "3               传媒  2021-05-25  NaN   60.0  下载  \n",
       "4           中国科技论文  2021-05-15  NaN    NaN  下载  \n",
       "..             ...         ...  ...    ...  ..  \n",
       "795           中国记者  2017-04-01  1.0  137.0  下载  \n",
       "796           中国记者  2017-04-01  2.0  257.0  下载  \n",
       "797       中国广播电视学刊  2017-04-01  4.0  136.0  下载  \n",
       "798       中国广播电视学刊  2017-04-01  NaN   64.0  下载  \n",
       "799       中学政治教学参考  2017-03-30  2.0  131.0  下载  \n",
       "\n",
       "[850 rows x 8 columns]"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_总表 = pd.read_html(page_html)[0].append(df_url_out)\n",
    "df_总表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成</td>\n",
       "      <td>杨宗晓; 杨克</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2021-06-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>82.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》</td>\n",
       "      <td>高倩楠</td>\n",
       "      <td>中国测试</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>论我国网上理论阵地的传播态势与发展大势</td>\n",
       "      <td>王凤翔</td>\n",
       "      <td>湖南大学学报(社会科学版)</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例</td>\n",
       "      <td>李红妮</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>新媒体技术与体育运动传播的融合路径——评《新媒体与体育传播》</td>\n",
       "      <td>鲁俊华</td>\n",
       "      <td>中国科技论文</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>795</th>\n",
       "      <td>846</td>\n",
       "      <td>让报道受欢迎,新华社新媒体用了这些“妙招”</td>\n",
       "      <td>马书平</td>\n",
       "      <td>中国记者</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>796</th>\n",
       "      <td>847</td>\n",
       "      <td>法国互联网产业的发展与新趋向</td>\n",
       "      <td>张伟</td>\n",
       "      <td>中国记者</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>2.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>797</th>\n",
       "      <td>848</td>\n",
       "      <td>阿基米德:打造传统广播转型平台</td>\n",
       "      <td>王海滨</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>4.0</td>\n",
       "      <td>136.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>798</th>\n",
       "      <td>849</td>\n",
       "      <td>杭州交通经济广播融合发展探索</td>\n",
       "      <td>蒋新琴; 魏琦</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2017-04-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>64.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>799</th>\n",
       "      <td>850</td>\n",
       "      <td>互联网思维下思想政治教育价值实现途径</td>\n",
       "      <td>李晓容</td>\n",
       "      <td>中学政治教学参考</td>\n",
       "      <td>2017-03-30</td>\n",
       "      <td>2.0</td>\n",
       "      <td>131.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>850 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                       篇名       作者  \\\n",
       "0             1             互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成  杨宗晓; 杨克   \n",
       "1             2  新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》      高倩楠   \n",
       "2             3                      论我国网上理论阵地的传播态势与发展大势      王凤翔   \n",
       "3             4          新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例      李红妮   \n",
       "4             5           新媒体技术与体育运动传播的融合路径——评《新媒体与体育传播》      鲁俊华   \n",
       "..          ...                                      ...      ...   \n",
       "795         846                    让报道受欢迎,新华社新媒体用了这些“妙招”      马书平   \n",
       "796         847                           法国互联网产业的发展与新趋向       张伟   \n",
       "797         848                          阿基米德:打造传统广播转型平台      王海滨   \n",
       "798         849                           杭州交通经济广播融合发展探索  蒋新琴; 魏琦   \n",
       "799         850                       互联网思维下思想政治教育价值实现途径      李晓容   \n",
       "\n",
       "                刊名        发表时间   被引     下载  操作  \n",
       "0             农业经济  2021-06-04  NaN   82.0  下载  \n",
       "1             中国测试  2021-05-31  NaN   16.0  下载  \n",
       "2    湖南大学学报(社会科学版)  2021-05-28  NaN   31.0  下载  \n",
       "3               传媒  2021-05-25  NaN   60.0  下载  \n",
       "4           中国科技论文  2021-05-15  NaN    NaN  下载  \n",
       "..             ...         ...  ...    ...  ..  \n",
       "795           中国记者  2017-04-01  1.0  137.0  下载  \n",
       "796           中国记者  2017-04-01  2.0  257.0  下载  \n",
       "797       中国广播电视学刊  2017-04-01  4.0  136.0  下载  \n",
       "798       中国广播电视学刊  2017-04-01  NaN   64.0  下载  \n",
       "799       中学政治教学参考  2017-03-30  2.0  131.0  下载  \n",
       "\n",
       "[850 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "with pd.ExcelWriter('知网文章数据.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_总表.to_excel(writer,sheet_name=\"知网\")\n",
    "display(df_总表)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
